* David Sturrock
* July 2018
* Uses life table and actual death data in ELSA to create an ELSA fitted survival curve *

clear
clear matrix
clear mata
set maxvar 30000


/********************************************************************/
/* Arrange life table data: create by age-sex-cohort hazard rates   */
/********************************************************************/

	* import male data *
	import excel using "$onsdata\ONS life tables 1841 onwards.xls", sheet("Males cohort lx") cellrange(A9:DN120) firstrow clear
	rename ageyears age
	local cohort=1899
	foreach var of varlist B-DN {
		local cohort=`cohort'+1
		rename `var' cohort`cohort'
		}
	drop if age < 50
	reshape long cohort , i(age) j(yob)
	rename cohort survive
	reshape wide survive , i(yob) j(age)
	gen sex = 1
	save "$temp\male_hazards.dta", replace
	
	* import female data *
	import excel using "$onsdata\ONS life tables 1841 onwards.xls", sheet("Females cohort lx") cellrange(A9:DN120) firstrow clear
	rename ageyears age
	local cohort=1899
	foreach var of varlist B-DN {
		local cohort=`cohort'+1
		rename `var' cohort`cohort'
		}
	drop if age < 50
	reshape long cohort , i(age) j(yob)
	rename cohort survive
	reshape wide survive , i(yob) j(age)
	gen sex = 2
	save "$temp\female_hazards.dta", replace
	
	append using "$temp\male_hazards.dta"

	* create hazard rates *
	forv age = 50(1)109 {
		local k = `age' +1
		gen ons_haz_`age' = 1- survive`k'/survive`age'
		}
	rename yob dobyear
	keep dobyear sex ons*
	reshape long ons_haz_, i(dobyear sex) j(age)
	rename ons_haz_ ons_haz
	save "$temp\ons_hazards.dta", replace
	
	* save survival curves *
	use  "$temp\female_hazards.dta", clear
	append using "$temp\male_hazards.dta"
	expand 61
	bys yob sex : gen age = _n +49
	forv myage = 50(1)110 {
		forv age = 110(-1)`myage' {
		replace survive`age' = survive`age'/survive`myage' if age == `myage'
		}
	}
	forv age = 50(1)110 {
		replace survive`age' = 1 if `age' < age
		}
	
	rename yob dobyear
	rename survive* prob*
	save "$temp\ons_survival.dta", replace

/***********************************************************/
/* Create survival curves in data		   */
/***********************************************************/

* Calculate proportion that die at each age in each sex-cohort group *

* create data set with death rates **

clear 
use "$savedata\wave1-7base.dta"


forvalues s = 1(1)2 {
	
		forv c = 1920(1)1959 {

		set more off
		preserve

		count if dobyear == `c' & sex == `s' & inrange(age,50,110)

			if r(N) >0 {

			* keep selected individuals *
			keep if dobyear == `c' & sex == `s'
			keep if inrange(age,50,110)
			keep if inrange(wave,1,5)

			* keep first observations of each individual *
			bys idauniq : egen first_wave = min(wave)
			keep if wave == first_wave
			
			* remove the new date of death info that records deaths after Feb 2013 *
			replace dodyr = . if dodyr > 2013
			replace dodyr = . if dodyr == 2013 & !inrange(dodmnth,1,2)
			replace dodmnth = . if dodyr == .

			* generate age that died *
			gen age_died = dodyr - year(dob) 
			replace age_died = age_died -1 if dodmnth < month(dob)

			* generate oldest age that could have lived to end of - latest month of death observed is Feb 2013 *
			gen oldest_poss_death = 2012 - dobyear - 1
			replace oldest_poss_death = oldest_poss_death + 1 if inrange(month(dob),1,2)

			su age
			local min = r(min) + 1
			su oldest_poss_death
			local max = r(max)

			* note that known_dead is "we know they are dead AND we would know if they were still alive" *
			* we want to record whether alive or dead only for years where their death would have been 
			* recorded in our data if it had happened at any point in that age year. i.e. clock starts at first
			* birthday after joining ELSA and stops at last on or birthday before Feb 2013. *
			forv age = `min'(1)`max' {
			* known alive requires that don't die at that age, were first observed at an age older than that age, and that would
			* got all the way 'through' that age before feb 2013 *
			gen known_alive_`age' = `age' < age_died & `age' > age & `age' <= oldest_poss_death
			gen known_dead_`age' = age_died <= `age' & `age' > age & `age' <= oldest_poss_death 
			gen died_`age' = age_died == `age' & `age' <= oldest_poss_death
			replace known_dead_`age' = . if `age' > oldest_poss_death | `age' <= age
			replace known_alive_`age' = . if `age' > oldest_poss_death | `age' <= age
			replace died_`age' = . if `age' > oldest_poss_death
			gen obs_`age' = known_alive_`age' ==1 | died_`age' ==1
			}

		collapse (mean) *alive_* *dead_* died* cohort5 (sum) obs_*
			
		* we construct a survival curve by making hazard rates at each age then getting implied survival curve *
		local j = `min' - 1
		gen surv_curv_`j' = 1
		forv age = `min'(1)`max' {
			local k = `age' - 1
			gen haz_`age' = died_`age'/(known_alive_`age' + died_`age')
			gen surv_curv_`age' = surv_curv_`k'*(1-haz_`age')
			}
			
		gen dobyear = `c'
		save "$temp\survival_curve_c`c'_sex`s'.dta", replace
		}
	restore
	}
}


	* append data and reshape *
	forv s = 1(1)2 {
		clear
		use "$temp\survival_curve_c1920_sex`s'.dta"
		forv c = 1921(1)1959 {
			append using "$temp\survival_curve_c`c'_sex`s'.dta"
			}
		keep haz* dobyear obs*
		reshape long haz_ obs_, i(dobyear) j(age)
		rename haz_ hazard 
		rename obs_ obs
		replace hazard = . if obs < 40
		save "$temp\hazards_sex`s'.dta", replace
		}



	* Calculate scaling factor for each sex *
	forv s = 1(1)2 {
		clear
		use  "$temp\hazards_sex`s'.dta"
		gen sex = `s'
		merge 1:1 dobyear sex age using "$temp\ons_hazards.dta"
		keep if sex == `s'
		collapse (mean) hazard ons_haz [fw = obs], by(age)
		export excel using "$temp\elsa_ons_period.xls", sheetreplace sheet("Sex `s'") firstrow(variables)
		gen age2 = age*age
		gen age3 = age2*age
		reg hazard age age2 age3
		predict haz_hat
		label variable age "Age"
		gen scale_smooth = haz_hat/ons_haz
		gen scale = hazard/ons_haz
		collapse (mean) scale_smooth scale
		gen sex = `s'
		save "$temp\scale_factor_`s'.dta", replace
		}

	* scale ONS curves in line with average proportion of ELSA to ONS hazards
	use "$temp\ons_hazards.dta"
	merge m:1 sex using "$temp\scale_factor_1.dta"
	drop _m
	merge m:1 sex using "$temp\scale_factor_2.dta", update replace
	drop _m
	gen scale_haz = ons_haz*scale_smooth
	save "$temp\scaled_hazards.dta", replace 

	forv s = 1(1)2 {
			clear
			use "$temp\scaled_hazards.dta"
			keep if sex == `s'
			keep age dobyear scale_*
			reshape wide scale_haz, i(age) j(dobyear)
			set obs 61
			gen num = _n
			replace age = 110 if num == 61
			drop num
			sort age
			forv c = 1900(1)1959 {
				gen surv_curv_scaled_`c' = 1
				forv age = 51(1)110 {
				replace surv_curv_scaled_`c' = surv_curv_scaled_`c'[_n-1]*(1-scale_haz`c'[_n-1]) if `age' == age
				}
			}
			reshape long surv_curv_scaled_, i(age) j(dobyear)
			gen sex = `s'
			save "$temp\scaled_surv_`s'", replace
			}

	clear 
	use "$temp\scaled_surv_1"
	append using "$temp\scaled_surv_2"
	keep age sex dobyear surv_curv_scaled
	sort sex dobyear age
	export excel using "$temp\scaled_life_tables.xls", sheetreplace sheet("Scaled_curves") firstrow(variables)

	*** get into format for calculating annuity rates * **
	rename surv_curv_scaled_ prob
	* reshape into wide survival curves
	reshape wide prob , i(sex dobyear) j(age)
	* duplicate so that can make survival curve for each age, not just 50
	expand 61
	bys dobyear sex : gen num = _n
	gen age = 49 + num
	* alter survival curve *
	gen prob_myage = .	
	forv age = 50(1)110 {
		replace prob_myage = prob`age' if age == `age'
		}
	forv age = 50(1)110 {
		replace prob`age' = prob`age'/prob_myage
		replace prob`age' = . if `age' < age
		}
	keep sex dobyear age prob*
	save "$temp\prob_survive_scaled.dta", replace
